Exercise 1 : load some data and look at their structures

1.1

Import characters.csv, episodes.csv, scenes.csv and appearances.csv files from the data directory and store them in variables of the same name.
library(readr)
library(dplyr)
library(tidyr)
library(tcltk)
characters = read_csv("got/data/characters.csv")
episodes = read_csv("got/data/episodes.csv")
scenes = read_csv("got/data/scenes.csv")
appearances = read_csv("got/data/appearances.csv")

1.2

Look at the size of the data.frame appearances, observe the variables common to the scenes and appearances tables. Use the str and summary functions on the different tables to understand their structures and relationships.
summary(characters)
     name               sex               house             killedBy        
 Length:587         Length:587         Length:587         Length:587        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
    image          
 Length:587        
 Class :character  
 Mode  :character  
str(characters)
tibble [587 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ name    : chr [1:587] "Addam Marbrand" "Adrack Humble" "Aeron Greyjoy" "Aerys Targaryen" ...
 $ sex     : chr [1:587] "male" "male" "male" "male" ...
 $ house   : chr [1:587] NA NA "Greyjoy" NA ...
 $ killedBy: chr [1:587] NA NA NA NA ...
 $ image   : chr [1:587] NA NA "https://images-na.ssl-images-amazon.com/images/M/MV5BNzI5MDg0ZDAtN2Y2ZC00MzU1LTgyYjQtNTBjYjEzODczZDVhXkEyXkFqcG"| __truncated__ NA ...
 - attr(*, "spec")=
  .. cols(
  ..   name = col_character(),
  ..   sex = col_character(),
  ..   house = col_character(),
  ..   killedBy = col_character(),
  ..   image = col_character()
  .. )
summary(episodes)
 episodeTitle         episodeNum       seasonNum       episodeId  total_duration
 Length:73          Min.   : 1.000   Min.   :1.000   Min.   : 1   Min.   :2696  
 Class :character   1st Qu.: 3.000   1st Qu.:2.000   1st Qu.:19   1st Qu.:2969  
 Mode  :character   Median : 5.000   Median :4.000   Median :37   Median :3142  
                    Mean   : 5.192   Mean   :4.205   Mean   :37   Mean   :3204  
                    3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:55   3rd Qu.:3279  
                    Max.   :10.000   Max.   :8.000   Max.   :73   Max.   :4510  
str(episodes)
tibble [73 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ episodeTitle  : chr [1:73] "Winter Is Coming" "The Kingsroad" "Lord Snow" "Cripples, Bastards, and Broken Things" ...
 $ episodeNum    : num [1:73] 1 2 3 4 5 6 7 8 9 10 ...
 $ seasonNum     : num [1:73] 1 1 1 1 1 1 1 1 1 1 ...
 $ episodeId     : num [1:73] 1 2 3 4 5 6 7 8 9 10 ...
 $ total_duration: num [1:73] 3509 3142 3257 3164 3073 ...
 - attr(*, "spec")=
  .. cols(
  ..   episodeTitle = col_character(),
  ..   episodeNum = col_double(),
  ..   seasonNum = col_double(),
  ..   episodeId = col_double(),
  ..   total_duration = col_double()
  .. )
summary(scenes) 
  sceneStart         sceneEnd          location         subLocation       
 Length:3840       Length:3840       Length:3840        Length:3840       
 Class1:hms        Class1:hms        Class :character   Class :character  
 Class2:difftime   Class2:difftime   Mode  :character   Mode  :character  
 Mode  :numeric    Mode  :numeric                                         
                                                                          
                                                                          
   episodeId        duration           nbc            sceneId          nbdeath       
 Min.   : 1.00   Min.   :  1.00   Min.   : 1.000   Min.   :   1.0   Min.   :0.00000  
 1st Qu.:27.00   1st Qu.: 14.00   1st Qu.: 2.000   1st Qu.: 960.8   1st Qu.:0.00000  
 Median :46.00   Median : 36.00   Median : 2.000   Median :1920.5   Median :0.00000  
 Mean   :44.18   Mean   : 60.91   Mean   : 3.155   Mean   :1920.5   Mean   :0.09792  
 3rd Qu.:65.00   3rd Qu.: 86.00   3rd Qu.: 4.000   3rd Qu.:2880.2   3rd Qu.:0.00000  
 Max.   :73.00   Max.   :661.00   Max.   :24.000   Max.   :3840.0   Max.   :7.00000  
str(scenes)
tibble [3,840 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ sceneStart : 'hms' num [1:3840] 00:00:40 00:01:45 00:03:24 00:03:31 ...
  ..- attr(*, "units")= chr "secs"
 $ sceneEnd   : 'hms' num [1:3840] 00:01:45 00:03:24 00:03:31 00:03:38 ...
  ..- attr(*, "units")= chr "secs"
 $ location   : chr [1:3840] "The Wall" "North of the Wall" "North of the Wall" "North of the Wall" ...
 $ subLocation: chr [1:3840] "Castle Black" "The Haunted Forest" "The Haunted Forest" "The Haunted Forest" ...
 $ episodeId  : num [1:3840] 1 1 1 1 1 1 1 1 1 1 ...
 $ duration   : num [1:3840] 65 99 7 7 112 5 7 10 23 18 ...
 $ nbc        : num [1:3840] 3 3 2 1 3 1 2 1 3 1 ...
 $ sceneId    : num [1:3840] 1 2 3 4 5 6 7 8 9 10 ...
 $ nbdeath    : num [1:3840] 0 0 1 0 0 0 0 0 1 0 ...
 - attr(*, "spec")=
  .. cols(
  ..   sceneStart = col_time(format = ""),
  ..   sceneEnd = col_time(format = ""),
  ..   location = col_character(),
  ..   subLocation = col_character(),
  ..   episodeId = col_double(),
  ..   duration = col_double(),
  ..   nbc = col_double(),
  ..   sceneId = col_double(),
  ..   nbdeath = col_double()
  .. )
summary(appearances)
     name              sceneId    
 Length:12114       Min.   :   1  
 Class :character   1st Qu.: 937  
 Mode  :character   Median :1842  
                    Mean   :1859  
                    3rd Qu.:2820  
                    Max.   :3840  
str(appearances)
tibble [12,114 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ name   : chr [1:12114] "Gared" "Waymar Royce" "Will" "Gared" ...
 $ sceneId: num [1:12114] 1 1 1 2 2 2 3 3 4 5 ...
 - attr(*, "spec")=
  .. cols(
  ..   name = col_character(),
  ..   sceneId = col_double()
  .. )

Exercise 2 : some R base commands

2.1

Use the scenes table to calculate the number of dead characters in the whole series.
sum(scenes$nbdeath)
[1] 376

2.2

Use the scenes table to calculate the number of dead characters in the first season.
sum(scenes$nbdeath[(scenes$episodeId <= sum(episodes$seasonNum == 1))])
[1] 32
sum(scenes$nbdeath[scenes$episodeId %in% episodes$episodeId[which(episodes$seasonNum == 1)]])
[1] 32

2.3

Use the characters table to find the 5 biggest murderers of the series.
sort(table(characters$killedBy),TRUE)[1:5]

          Jon Snow Daenerys Targaryen         Arya Stark     Sandor Clegane 
                12                 11                 10                  9 
  Cersei Lannister 
                 7 

2.4

Find the length of the longest scene and the id of the episode.
scenes[which.max(scenes$duration),]
# A tibble: 1 x 9
  sceneStart sceneEnd location    subLocation     episodeId duration   nbc sceneId nbdeath
  <time>     <time>   <chr>       <chr>               <dbl>    <dbl> <dbl>   <dbl>   <dbl>
1 45'58"     56'59"   The Crownl… Outside King's…        73      661    18    3795       0

Exercise 3 : dplyr

3.1

Find the longest scene duration and episode id using dplyr this time.
scenes %>%
  arrange(desc(duration)) %>%
  head(1) %>%
  select(duration, episodeId)
# A tibble: 1 x 2
  duration episodeId
     <dbl>     <dbl>
1      661        73

3.2

Find the characters in the longest scene.
scenes %>%
  arrange(desc(duration)) %>%
  head(1) %>%
  left_join(appearances, by=c("sceneId")) %>%
  select(name)
# A tibble: 18 x 1
   name            
   <chr>           
 1 Tyrion Lannister
 2 Grey Worm       
 3 Samwell Tarly   
 4 Edmure Tully    
 5 Arya Stark      
 6 Bran Stark      
 7 Sansa Stark     
 8 Brienne of Tarth
 9 Davos Seaworth  
10 Gendry          
11 Yara Greyjoy    
12 Robin Arryn     
13 Yohn Royce      
14 Dornish Lord    
15 Lord #1         
16 Lord #2         
17 Lord #3         
18 Lord #4         

3.3

Find the most visited place.
scenes %>%
  group_by(location) %>%
  summarise(nbscenes = n()) %>%
  arrange(desc(nbscenes)) %>%
  head(1) %>%
  select(location)
# A tibble: 1 x 1
  location      
  <chr>         
1 The Crownlands

3.4

How many scenes take place in Port Real ?
scenes %>%
  filter(location == "Port Real") %>%
  group_by(location) %>%
  summarise(nbscenes = n())
# A tibble: 0 x 2
# … with 2 variables: location <chr>, nbscenes <int>

3.5

Find the precise location (subLocation) where the most people die?
scenes %>%
  group_by(subLocation) %>%
  summarise(nd = sum(nbdeath)) %>%
  arrange(desc(nd))
# A tibble: 97 x 2
   subLocation           nd
   <chr>              <dbl>
 1 King's Landing        74
 2 Winterfell            60
 3 <NA>                  51
 4 Castle Black          41
 5 The Twins             12
 6 The Haunted Forest     9
 7 The Wall               9
 8 Craster's Keep         8
 9 The Wolfswood          8
10 Dragonstone            6
# … with 87 more rows

3.6

Find the episode where Jon Snow has the longuest screen time.
appearances %>%
  filter(name == "Jon Snow") %>%
  left_join(scenes) %>%
  left_join(episodes) %>%
  group_by(name, episodeId, episodeTitle) %>%
  summarise(screenTime = sum(duration)) %>%
  arrange(desc(screenTime)) %>%
  head(1)
# A tibble: 1 x 4
# Groups:   name, episodeId [1]
  name     episodeId episodeTitle    screenTime
  <chr>        <dbl> <chr>                <dbl>
1 Jon Snow        73 The Iron Throne       2526

3.7

how many characters do have more than 30 minutes of screen time ?
appearances %>%
  left_join(scenes) %>%
  group_by(name) %>%
  summarise(screenTime = sum(duration)) %>%
  filter(screenTime>30*60) %>%
  nrow()
[1] 98

3.8

Which characters do have the more scenes together.
appearances %>%
  left_join(appearances, by=c("sceneId"="sceneId")) %>%
  filter(name.x != name.y) %>%
  group_by(name.x,name.y) %>%
  summarise(nbscenes = n()) %>%
  arrange(desc(nbscenes))
# A tibble: 8,472 x 3
# Groups:   name.x [576]
   name.x             name.y             nbscenes
   <chr>              <chr>                 <int>
 1 Daenerys Targaryen Drogon                  172
 2 Drogon             Daenerys Targaryen      172
 3 Daenerys Targaryen Jorah Mormont           148
 4 Jorah Mormont      Daenerys Targaryen      148
 5 Jon Snow           Tormund Giantsbane      145
 6 Tormund Giantsbane Jon Snow                145
 7 Lord Varys         Tyrion Lannister        141
 8 Tyrion Lannister   Lord Varys              141
 9 Davos Seaworth     Jon Snow                136
10 Jon Snow           Davos Seaworth          136
# … with 8,462 more rows

3.9

Which two characters spend the most time together?
appearances %>%
  left_join(appearances, by=c("sceneId"="sceneId")) %>%
  filter(name.x != name.y) %>%
  left_join(scenes %>% select(sceneId,duration)) %>%
  group_by(name.x,name.y) %>%
  summarise(screenTime = sum(duration)) %>%
  arrange(desc(screenTime))
# A tibble: 8,472 x 3
# Groups:   name.x [576]
   name.x             name.y             screenTime
   <chr>              <chr>                   <dbl>
 1 Daenerys Targaryen Jorah Mormont           12923
 2 Jorah Mormont      Daenerys Targaryen      12923
 3 Lord Varys         Tyrion Lannister        10764
 4 Tyrion Lannister   Lord Varys              10764
 5 Davos Seaworth     Jon Snow                10380
 6 Jon Snow           Davos Seaworth          10380
 7 Daenerys Targaryen Missandei                9924
 8 Missandei          Daenerys Targaryen       9924
 9 Jon Snow           Tormund Giantsbane       9352
10 Tormund Giantsbane Jon Snow                 9352
# … with 8,462 more rows

3.10

Build a data.frame with one line per character containing a name column and a column for each place with the duration of presence of each character. If a character has never been in a place the value is equal to 0.
df <- scenes %>%
  left_join(appearances) %>%
  group_by(name,location) %>%
  summarise(screenTime = sum(duration)) %>%
  pivot_wider(values_from=screenTime,names_from=location,values_fill=c("duration"=0))

df
# A tibble: 577 x 27
# Groups:   name [577]
   name  `The Riverlands` `The North` `The Iron Islan… `North of the W… `The Red Waste`
   <chr>            <dbl>       <dbl>            <dbl>            <dbl>           <dbl>
 1 Adda…               76           0                0                0               0
 2 Adel…                0          36                0                0               0
 3 Adra…                0         190                0                0               0
 4 Aero…                0           0              601                0               0
 5 Aery…                0           0                0               23               0
 6 Aggo                 0           0                0                0             185
 7 Akho                 0           0                0                0               0
 8 Alli…                0           0                0                0               0
 9 Alto…               92           0                0                0               0
10 Alys…                0        1050                0                0               0
# … with 567 more rows, and 21 more variables: `The Dothraki Sea` <dbl>, `Vaes
#   Dothrak` <dbl>, `The Wall` <dbl>, `The Crownlands` <dbl>, `The Westerlands` <dbl>,
#   Braavos <dbl>, `The Vale` <dbl>, `The Reach` <dbl>, Dorne <dbl>, `The Sunset
#   Sea` <dbl>, Meereen <dbl>, Astapor <dbl>, Yunkai <dbl>, `The Stormlands` <dbl>,
#   Volantis <dbl>, Pentos <dbl>, Qarth <dbl>, `The Narrow Sea` <dbl>, `The Shivering
#   Sea` <dbl>, `The Summer Sea` <dbl>, Valyria <dbl>

3.11

Construct from the previous data.frame a matrix containing only the numerical variables. Filter it to keep only the lines whose sum is higher than 3600. Normalize it so that the sums in lines are equal to 1. Give the name of each character kept to the corresponding line in the matrix with the function rownames.
X <- as.matrix(df[,-1])
Xs <- X[rowSums(X)>3600,]
Xns <- Xs/rowSums(Xs)
rownames(Xns) <- df$name[rowSums(X)>3600]

Xns
                      The Riverlands  The North The Iron Islands North of the Wall
Alliser Thorne           0.000000000 0.00000000        0.0000000      0.0000000000
Arya Stark               0.365864693 0.22249640        0.0000000      0.0000000000
Barristan Selmy          0.000000000 0.00000000        0.0000000      0.0000000000
                      The Red Waste The Dothraki Sea Vaes Dothrak    The Wall
Alliser Thorne           0.00000000       0.00000000   0.00000000 1.000000000
Arya Stark               0.00000000       0.00000000   0.00000000 0.000000000
Barristan Selmy          0.00000000       0.00000000   0.00000000 0.000000000
                      The Crownlands The Westerlands    Braavos   The Vale  The Reach
Alliser Thorne            0.00000000     0.000000000 0.00000000 0.00000000 0.00000000
Arya Stark                0.16981287     0.000000000 0.20608678 0.03240798 0.00000000
Barristan Selmy           0.29420709     0.000000000 0.00000000 0.00000000 0.00000000
                             Dorne The Sunset Sea    Meereen    Astapor     Yunkai
Alliser Thorne        0.0000000000    0.000000000 0.00000000 0.00000000 0.00000000
Arya Stark            0.0000000000    0.003331277 0.00000000 0.00000000 0.00000000
Barristan Selmy       0.0000000000    0.000000000 0.38680249 0.13744868 0.18154174
                      The Stormlands   Volantis     Pentos      Qarth The Narrow Sea
Alliser Thorne           0.000000000 0.00000000 0.00000000 0.00000000   0.0000000000
Arya Stark               0.000000000 0.00000000 0.00000000 0.00000000   0.0000000000
Barristan Selmy          0.000000000 0.00000000 0.00000000 0.00000000   0.0000000000
                      The Shivering Sea The Summer Sea     Valyria
Alliser Thorne              0.000000000   0.0000000000 0.000000000
Arya Stark                  0.000000000   0.0000000000 0.000000000
Barristan Selmy             0.000000000   0.0000000000 0.000000000
 [ reached getOption("max.print") -- omitted 52 rows ]

3.12

Using the function `dist calculate the manhatan distance between each line of the previous matrix. Then perform a hierarchical clustering with this distance matrix and display the result. You should get a figure similar to the following one :
hc <- hclust(dist(Xns,method="manhattan"))
X11()
plot(hc, main = "Clustering of the main characters (geographical profiles)", sub = "@roortheroor, 2020", xlab = "")

capture <- tk_messageBox(message = "close plot", detail = "")